import random
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# Load the IBM HR Employee-Attrition dataset (1470 rows x 35 columns).
df = pd.read_csv("../data/raw/Employee-Attrition.csv")
df.head()
| Age | Attrition | BusinessTravel | DailyRate | Department | DistanceFromHome | Education | EducationField | EmployeeCount | EmployeeNumber | ... | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 41 | Yes | Travel_Rarely | 1102 | Sales | 1 | 2 | Life Sciences | 1 | 1 | ... | 1 | 80 | 0 | 8 | 0 | 1 | 6 | 4 | 0 | 5 |
| 1 | 49 | No | Travel_Frequently | 279 | Research & Development | 8 | 1 | Life Sciences | 1 | 2 | ... | 4 | 80 | 1 | 10 | 3 | 3 | 10 | 7 | 1 | 7 |
| 2 | 37 | Yes | Travel_Rarely | 1373 | Research & Development | 2 | 2 | Other | 1 | 4 | ... | 2 | 80 | 0 | 7 | 3 | 3 | 0 | 0 | 0 | 0 |
| 3 | 33 | No | Travel_Frequently | 1392 | Research & Development | 3 | 4 | Life Sciences | 1 | 5 | ... | 3 | 80 | 0 | 8 | 3 | 3 | 8 | 7 | 3 | 0 |
| 4 | 27 | No | Travel_Rarely | 591 | Research & Development | 2 | 1 | Medical | 1 | 7 | ... | 4 | 80 | 1 | 6 | 3 | 3 | 2 | 2 | 2 | 2 |
5 rows × 35 columns
# (rows, columns) sanity check.
df.shape
(1470, 35)
def generate_meta_signals(df):
    """Summarise every column of ``df`` in one metadata row.

    Each row carries: missing-value count and share, dtype, the numeric
    ``describe()`` statistics (NaN for non-numeric columns), and the most
    frequent value together with its relative frequency.

    Returns a DataFrame with columns:
    Index, Missing Values, % of Total Values, dtypes, count, mean, std,
    min, 25%, 50%, 75%, max, Most Frequently Value, % of MF Values.
    """
    # Missing-value count per column; 'index' holds the column name and
    # the unnamed values column comes out as 0.
    meta_df = df.isna().sum().reset_index()
    meta_df['% of Total Values'] = meta_df[0] / len(df)
    # df.dtypes is already in column order, so positional assignment is safe.
    meta_df['dtypes'] = df.dtypes.values
    # describe() covers numeric columns only; the left merge leaves NaN
    # for the non-numeric ones.
    d = df.describe().T.reset_index()
    meta_df = meta_df.merge(d, on=['index'], how='left')
    # Most frequent value per column, read straight off value_counts.
    # (The previous transpose/idxmax construction relied on the
    # value_counts Series being named after the column, which newer
    # pandas no longer does — the merge keys silently stopped matching.)
    mf_rows = []
    for col in df.columns:
        vc = df[col].value_counts(normalize=True)
        if len(vc):
            mf_rows.append((col, vc.index[0], vc.iloc[0]))
    mf = pd.DataFrame(mf_rows, columns=['index', 'mf_value', 'mf_share'])
    meta_df = meta_df.merge(mf, on=['index'], how='left')
    meta_df.columns = ['Index', 'Missing Values', '% of Total Values', 'dtypes',
                       'count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max',
                       'Most Frequently Value', '% of MF Values']
    return meta_df
def bad_flag(row):
    """Return True when a feature (one meta_df row) looks useless.

    A column is flagged as "trash" when it has zero variance, more than
    15% missing values, a single value covering more than 80% of the
    rows, or no most-frequent-value share at all.
    """
    mf_share = row['% of MF Values']
    has_no_variance = row['std'] == 0
    too_many_missing = row['% of Total Values'] > .15
    too_dominant = pd.isnull(mf_share) or mf_share > .80
    return bool(has_no_variance or too_many_missing or too_dominant)
# Build the per-column metadata table and flag low-information features.
meta_df = generate_meta_signals(df)
meta_df['is Trash var'] = meta_df.apply(lambda r: bad_flag(r), axis=1)
meta_df.head(35)
| Index | Missing Values | % of Total Values | dtypes | count | mean | std | min | 25% | 50% | 75% | max | Most Frequently Value | % of MF Values | is Trash var | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Age | 0 | 0.0 | int64 | 1470.0 | 36.923810 | 9.135373 | 18.0 | 30.00 | 36.0 | 43.00 | 60.0 | 35.0 | 0.053061 | False |
| 1 | Attrition | 0 | 0.0 | object | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | No | 0.838776 | True |
| 2 | BusinessTravel | 0 | 0.0 | object | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | Travel_Rarely | 0.709524 | False |
| 3 | DailyRate | 0 | 0.0 | int64 | 1470.0 | 802.485714 | 403.509100 | 102.0 | 465.00 | 802.0 | 1157.00 | 1499.0 | 691.0 | 0.004082 | False |
| 4 | Department | 0 | 0.0 | object | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | Research & Development | 0.653741 | False |
| 5 | DistanceFromHome | 0 | 0.0 | int64 | 1470.0 | 9.192517 | 8.106864 | 1.0 | 2.00 | 7.0 | 14.00 | 29.0 | 2.0 | 0.143537 | False |
| 6 | Education | 0 | 0.0 | int64 | 1470.0 | 2.912925 | 1.024165 | 1.0 | 2.00 | 3.0 | 4.00 | 5.0 | 3.0 | 0.389116 | False |
| 7 | EducationField | 0 | 0.0 | object | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | Life Sciences | 0.412245 | False |
| 8 | EmployeeCount | 0 | 0.0 | int64 | 1470.0 | 1.000000 | 0.000000 | 1.0 | 1.00 | 1.0 | 1.00 | 1.0 | 1.0 | 1.0 | True |
| 9 | EmployeeNumber | 0 | 0.0 | int64 | 1470.0 | 1024.865306 | 602.024335 | 1.0 | 491.25 | 1020.5 | 1555.75 | 2068.0 | 2048.0 | 0.00068 | False |
| 10 | EnvironmentSatisfaction | 0 | 0.0 | int64 | 1470.0 | 2.721769 | 1.093082 | 1.0 | 2.00 | 3.0 | 4.00 | 4.0 | 3.0 | 0.308163 | False |
| 11 | Gender | 0 | 0.0 | object | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | Male | 0.6 | False |
| 12 | HourlyRate | 0 | 0.0 | int64 | 1470.0 | 65.891156 | 20.329428 | 30.0 | 48.00 | 66.0 | 83.75 | 100.0 | 66.0 | 0.019728 | False |
| 13 | JobInvolvement | 0 | 0.0 | int64 | 1470.0 | 2.729932 | 0.711561 | 1.0 | 2.00 | 3.0 | 3.00 | 4.0 | 3.0 | 0.590476 | False |
| 14 | JobLevel | 0 | 0.0 | int64 | 1470.0 | 2.063946 | 1.106940 | 1.0 | 1.00 | 2.0 | 3.00 | 5.0 | 1.0 | 0.369388 | False |
| 15 | JobRole | 0 | 0.0 | object | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | Sales Executive | 0.221769 | False |
| 16 | JobSatisfaction | 0 | 0.0 | int64 | 1470.0 | 2.728571 | 1.102846 | 1.0 | 2.00 | 3.0 | 4.00 | 4.0 | 4.0 | 0.312245 | False |
| 17 | MaritalStatus | 0 | 0.0 | object | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | Married | 0.457823 | False |
| 18 | MonthlyIncome | 0 | 0.0 | int64 | 1470.0 | 6502.931293 | 4707.956783 | 1009.0 | 2911.00 | 4919.0 | 8379.00 | 19999.0 | 2342.0 | 0.002721 | False |
| 19 | MonthlyRate | 0 | 0.0 | int64 | 1470.0 | 14313.103401 | 7117.786044 | 2094.0 | 8047.00 | 14235.5 | 20461.50 | 26999.0 | 4223.0 | 0.002041 | False |
| 20 | NumCompaniesWorked | 0 | 0.0 | int64 | 1470.0 | 2.693197 | 2.498009 | 0.0 | 1.00 | 2.0 | 4.00 | 9.0 | 1.0 | 0.354422 | False |
| 21 | Over18 | 0 | 0.0 | object | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | Y | 1.0 | True |
| 22 | OverTime | 0 | 0.0 | object | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | No | 0.717007 | False |
| 23 | PercentSalaryHike | 0 | 0.0 | int64 | 1470.0 | 15.209524 | 3.659938 | 11.0 | 12.00 | 14.0 | 18.00 | 25.0 | 11.0 | 0.142857 | False |
| 24 | PerformanceRating | 0 | 0.0 | int64 | 1470.0 | 3.153741 | 0.360824 | 3.0 | 3.00 | 3.0 | 3.00 | 4.0 | 3.0 | 0.846259 | True |
| 25 | RelationshipSatisfaction | 0 | 0.0 | int64 | 1470.0 | 2.712245 | 1.081209 | 1.0 | 2.00 | 3.0 | 4.00 | 4.0 | 3.0 | 0.312245 | False |
| 26 | StandardHours | 0 | 0.0 | int64 | 1470.0 | 80.000000 | 0.000000 | 80.0 | 80.00 | 80.0 | 80.00 | 80.0 | 80.0 | 1.0 | True |
| 27 | StockOptionLevel | 0 | 0.0 | int64 | 1470.0 | 0.793878 | 0.852077 | 0.0 | 0.00 | 1.0 | 1.00 | 3.0 | 0.0 | 0.429252 | False |
| 28 | TotalWorkingYears | 0 | 0.0 | int64 | 1470.0 | 11.279592 | 7.780782 | 0.0 | 6.00 | 10.0 | 15.00 | 40.0 | 10.0 | 0.137415 | False |
| 29 | TrainingTimesLastYear | 0 | 0.0 | int64 | 1470.0 | 2.799320 | 1.289271 | 0.0 | 2.00 | 3.0 | 3.00 | 6.0 | 2.0 | 0.372109 | False |
| 30 | WorkLifeBalance | 0 | 0.0 | int64 | 1470.0 | 2.761224 | 0.706476 | 1.0 | 2.00 | 3.0 | 3.00 | 4.0 | 3.0 | 0.607483 | False |
| 31 | YearsAtCompany | 0 | 0.0 | int64 | 1470.0 | 7.008163 | 6.126525 | 0.0 | 3.00 | 5.0 | 9.00 | 40.0 | 5.0 | 0.133333 | False |
| 32 | YearsInCurrentRole | 0 | 0.0 | int64 | 1470.0 | 4.229252 | 3.623137 | 0.0 | 2.00 | 3.0 | 7.00 | 18.0 | 2.0 | 0.253061 | False |
| 33 | YearsSinceLastPromotion | 0 | 0.0 | int64 | 1470.0 | 2.187755 | 3.222430 | 0.0 | 0.00 | 1.0 | 3.00 | 15.0 | 0.0 | 0.395238 | False |
| 34 | YearsWithCurrManager | 0 | 0.0 | int64 | 1470.0 | 4.123129 | 3.568136 | 0.0 | 2.00 | 3.0 | 7.00 | 17.0 | 2.0 | 0.234014 | False |
From this table we can see that `EmployeeCount`, `Over18`, and `StandardHours` are constant columns (zero variance), so they carry no information and can be dropped:
# Drop the constant columns flagged above (std == 0 / a single value).
df.drop(['EmployeeCount', 'Over18', 'StandardHours'], axis=1, inplace=True)
from sklearn.preprocessing import LabelEncoder
map_list = []  # NOTE(review): never used afterwards — candidate for removal
le_name_mapping = dict()  # column -> {original label: encoded int}
le = LabelEncoder()
# Encode the target as integers ('No'/'Yes' -> 0/1, alphabetical order).
df["Attrition"] = le.fit_transform(df["Attrition"])
le_name_mapping["Attrition"] = dict(zip(le.classes_, le.transform(le.classes_)))
import plotly.express as px
# Donut chart showing the class balance of the target.
fig = px.pie(df, "Attrition", title="Attrition distribution", hole=.3)
fig.show()
Our target variable is imbalanced: about 84% of employees have not left the company.
# Correlation of every numeric feature (target excluded) with Attrition.
attr_corr = df[(df.select_dtypes(include=np.number).columns.difference(['Attrition']))].corrwith(df["Attrition"]).sort_values(ascending=False)
df_attr_corr = pd.DataFrame(attr_corr)
# One-column heatmap for a quick visual ranking of the correlations.
fig = px.imshow(df_attr_corr, color_continuous_scale='Viridis')
fig.update_layout(
    height=800
)
fig.show()
To be honest, there is no significant correlation between any single feature and `Attrition`.
# Pairwise correlation matrix of the numeric features (target excluded).
df_corr = df[(df.select_dtypes(include=np.number).columns.difference(['Attrition']))].corr()
text = df_corr.values.tolist()  # NOTE(review): unused — candidate for removal
fig = px.imshow(df_corr, color_continuous_scale='Viridis', aspect="auto")
fig.update_xaxes(side="top")
fig.show()
We can notice a high correlation between several pairs of features in the heatmap above.
# Distribution of total working years, split by attrition outcome.
fig = px.histogram(df, x="TotalWorkingYears", color="Attrition")
fig.show()
df["TotalWorkingYears"].corr(df["Attrition"])
-0.17106324613622667
Looking at the graph above, we can tell that experience (`TotalWorkingYears`) has a very weak correlation with employee attrition, so experience does not appear to be a major factor in attrition.
# Distribution of age, split by attrition outcome.
fig = px.histogram(df, x="Age", color="Attrition")
fig.show()
df["Age"].corr(df["Attrition"])
-0.1592050068657797
`Age` behaves like experience (`TotalWorkingYears`): neither has a strong correlation with employee attrition.
# Attrition counts by gender, side by side.
fig = px.histogram(df, x="Gender", color="Attrition", barmode="group")
fig.show()
From the graphs above, all of these features show a very weak to non-existent association with employee attrition.
# Attrition counts by business-travel frequency.
fig = px.histogram(df, x='BusinessTravel', color='Attrition', barmode="group")
fig.show()
# Attrition counts by overtime status.
fig = px.histogram(df, x='OverTime', color='Attrition', barmode="group")
fig.show()
# Overtime broken down by gender.
pd.crosstab(df['OverTime'], df['Gender'])
| Gender | Female | Male |
|---|---|---|
| OverTime | ||
| No | 408 | 646 |
| Yes | 180 | 236 |
# Monthly income distribution by attrition outcome.
fig = px.histogram(df, x="MonthlyIncome", color="Attrition")
fig.show()
# Save two feature rows (target dropped) as a sample request payload.
# NOTE(review): index=[0] is not a valid bool — a non-empty list is truthy,
# so the row index IS written to the CSV; index=False was probably
# intended. Confirm what the consumer of request.csv expects.
df[df.columns.difference(["Attrition"])].head(2).to_csv("../data/request.csv", index=[0])
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import MinMaxScaler
# categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
# print(categorical_columns)
# for cat_col in categorical_columns:
# le = LabelEncoder()
# df[cat_col] = le.fit_transform(df[cat_col])
# le_name_mapping[cat_col] = dict(zip(le.classes_, le.transform(le.classes_)))
# le_name_mapping
# Hold out 20% of the rows for evaluation; fixed seed for reproducibility.
df_train, df_test = train_test_split(df, test_size=.2, random_state=1)
y_train, y_test = df_train['Attrition'], df_test['Attrition']
# Remove the target from the feature frames.
for df_ in [df_train, df_test]:
    del df_['Attrition']
# Series.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported way to stack the two target Series.
y = pd.concat([y_train, y_test], ignore_index=True)
# Row-wise dicts feed DictVectorizer below.
train_dict = df_train.to_dict(orient='records')
test_dict = df_test.to_dict(orient='records')
# sparse - is a matrix that is comprised of mostly zero values.
# DictVectorizer one-hot encodes the string fields and passes numeric
# fields through, learning the feature layout from the training records.
dv = DictVectorizer(sparse=False)
dv.fit(train_dict)
DictVectorizer(sparse=False)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
DictVectorizer(sparse=False)
# Vectorize both splits with the layout learned from the training set.
X_train = dv.transform(train_dict)
X_test = dv.transform(test_dict)
X = np.concatenate((X_train, X_test))
# get_feature_names was deprecated in scikit-learn 1.0 and removed in 1.2
# (the original run even printed the FutureWarning) — use the _out variant.
dv.get_feature_names_out()
C:\Users\User\Anaconda\lib\site-packages\sklearn\utils\deprecation.py:87: FutureWarning: Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.
['Age', 'BusinessTravel=Non-Travel', 'BusinessTravel=Travel_Frequently', 'BusinessTravel=Travel_Rarely', 'DailyRate', 'Department=Human Resources', 'Department=Research & Development', 'Department=Sales', 'DistanceFromHome', 'Education', 'EducationField=Human Resources', 'EducationField=Life Sciences', 'EducationField=Marketing', 'EducationField=Medical', 'EducationField=Other', 'EducationField=Technical Degree', 'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender=Female', 'Gender=Male', 'HourlyRate', 'JobInvolvement', 'JobLevel', 'JobRole=Healthcare Representative', 'JobRole=Human Resources', 'JobRole=Laboratory Technician', 'JobRole=Manager', 'JobRole=Manufacturing Director', 'JobRole=Research Director', 'JobRole=Research Scientist', 'JobRole=Sales Executive', 'JobRole=Sales Representative', 'JobSatisfaction', 'MaritalStatus=Divorced', 'MaritalStatus=Married', 'MaritalStatus=Single', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'OverTime=No', 'OverTime=Yes', 'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction', 'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager']
# Univariate ranking: ROC AUC of each raw numeric feature used as a score.
numerical_vars = list(df[df.columns.difference(["Attrition"])].select_dtypes(include=np.number).columns)
aucs = dict()
for var in numerical_vars:
    auc_ = roc_auc_score(y_train, df_train[var])
    # AUC < 0.5 means the feature ranks in the opposite direction;
    # negate it so every feature is reported on the >= 0.5 side.
    if auc_ < 0.5:
        auc_ = roc_auc_score(y_train, -df_train[var])
    aucs[var] = auc_
sorted(aucs.items(), key=lambda x: x[1])
[('MonthlyRate', 0.501490505034657),
('Education', 0.5064663263533618),
('HourlyRate', 0.5100384953743913),
('PerformanceRating', 0.510694093453545),
('EmployeeNumber', 0.5113496915326987),
('RelationshipSatisfaction', 0.514397942430644),
('NumCompaniesWorked', 0.5167317595243832),
('TrainingTimesLastYear', 0.5218028386836487),
('PercentSalaryHike', 0.5256747897323254),
('YearsSinceLastPromotion', 0.5261370704291646),
('DailyRate', 0.5464466023769633),
('JobSatisfaction', 0.5560480323652521),
('WorkLifeBalance', 0.56864448092882),
('EnvironmentSatisfaction', 0.5697231358881112),
('JobInvolvement', 0.5741946509920823),
('DistanceFromHome', 0.5742198663028191),
('StockOptionLevel', 0.6228742092198383),
('Age', 0.6255666440662772),
('YearsWithCurrManager', 0.6292396743302534),
('JobLevel', 0.6324560272997765),
('YearsInCurrentRole', 0.6384432627491413),
('TotalWorkingYears', 0.6402727736281469),
('YearsAtCompany', 0.6428167183113587),
('MonthlyIncome', 0.6477112902954674)]
# Scale features to [0, 1]; fit on the training portion only so test-set
# statistics do not leak into the transform.
min_max = MinMaxScaler()
min_max.fit(X_train)
X_train_scaled = min_max.transform(X_train)
X_test_scaled = min_max.transform(X_test)
X_scaled = np.concatenate((X_train_scaled, X_test_scaled))
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
# from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score, precision_score, recall_score, precision_recall_curve
# Baseline models with default hyperparameters.
log_clf = LogisticRegression()
svc_clf = SVC()
# knn_clf = KNeighborsClassifier()
dt_clf = DecisionTreeClassifier()
rf_clf = RandomForestClassifier()
cat_clf = CatBoostClassifier()
xgb_clf = XGBClassifier()
# 5-fold CV comparison of the baseline classifiers on the raw features.
# NOTE(review): X/y concatenate the earlier 80/20 split, so this CV also
# re-uses the held-out rows — acceptable for model comparison, but the
# split no longer acts as an untouched test set here.
kfold = KFold(n_splits=5, shuffle=True, random_state=1)
scores = dict()
for clf in [log_clf, svc_clf, dt_clf, rf_clf, cat_clf, xgb_clf]:
    fold_aucs = []
    for train_idxs, val_idxs in kfold.split(X):
        # Fold-local names so the module-level X_train/X_test/y_train/
        # y_test from the earlier split are not clobbered.
        X_tr, X_val = X[train_idxs], X[val_idxs]
        y_tr, y_val = y[train_idxs], y[val_idxs]
        clf.fit(X_tr, y_tr)
        # Rank with probabilities (or decision scores) rather than hard
        # 0/1 predictions: ROC AUC on class labels collapses the curve
        # to a single point and understates model quality.
        if hasattr(clf, "predict_proba"):
            y_score = clf.predict_proba(X_val)[:, 1]
        else:
            y_score = clf.decision_function(X_val)
        fold_aucs.append(roc_auc_score(y_val, y_score))
    scores[clf.__class__.__name__] = fold_aucs
for key in scores.keys():
    print(f'{key}' + ' ' + f'{np.mean(scores[key]):.3f} +- {np.std(scores[key]):.3f}')
# Same 5-fold comparison, this time on the MinMax-scaled feature matrix.
kfold = KFold(n_splits=5, shuffle=True, random_state=1)
scores = dict()
for clf in [log_clf, svc_clf, dt_clf, rf_clf, cat_clf, xgb_clf]:
    clf_list = []
    for train_idxs, test_idxs in kfold.split(X_scaled):
        # NOTE(review): these assignments clobber the module-level
        # X_train_scaled/X_test_scaled from the earlier split, and the
        # later hyperparameter-search cells read the LAST fold's slices
        # through these names. The names are kept to preserve that
        # behavior, but a dedicated split would be cleaner — confirm.
        X_train_scaled, X_test_scaled = X_scaled[train_idxs], X_scaled[test_idxs]
        y_train_scaled, y_test_scaled = y[train_idxs], y[test_idxs]
        clf.fit(X_train_scaled, y_train_scaled)
        # Score with probabilities/decision values, not hard labels,
        # so roc_auc_score sees a real ranking.
        if hasattr(clf, "predict_proba"):
            y_score = clf.predict_proba(X_test_scaled)[:, 1]
        else:
            y_score = clf.decision_function(X_test_scaled)
        clf_list.append(roc_auc_score(y_test_scaled, y_score))
    scores[clf.__class__.__name__] = clf_list
for key in scores.keys():
    print(f'{key}' + ' ' + f'{np.mean(scores[key]):.3f} +- {np.std(scores[key]):.3f}')
This is the reason why feature scaling is used: by scaling the features down to a common range we achieve better scores.
from sklearn.model_selection import RandomizedSearchCV
# Randomized search over logistic-regression hyperparameters.
solvers = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
penalty = ['none', 'l1', 'l2', 'elasticnet']
c_values = [100, 10, 1.0, 0.1, 0.01]
# NOTE(review): not every solver supports every penalty (e.g. lbfgs+l1,
# elasticnet without saga) — invalid combos fail and score NaN, wasting
# part of the 20 sampled iterations.
lr_grid = dict(solver=solvers, penalty=penalty, C=c_values)
print(lr_grid)
lr_random = RandomizedSearchCV(log_clf, param_distributions=lr_grid, scoring='roc_auc', cv=5, n_iter=20,
                               random_state=1, n_jobs=-1, verbose=1)
# NOTE(review): X_train_scaled/y_train_scaled here are whatever the last
# CV fold above left behind, not the original 80/20 split — verify intent.
lr_random.fit(X_train_scaled, y_train_scaled)
y_pred_lr = lr_random.predict_proba(X_test_scaled)[:, 1]
print(lr_random.best_score_)
print(lr_random.best_params_)
# n_neighbors = [int(x) for x in np.arange(1,22,1)]
# metric = ['eucledian', 'manhattan', 'minkowski']
# weights = ['uniform', 'distance']
# knn_grid = dict(n_neighbors=n_neighbors, weights=weights, metric=metric)
# print(knn_grid)
# knn_random = RandomizedSearchCV(knn_clf, param_distributions=knn_grid, scoring='roc_auc', cv=5, n_iter=20,
# random_state=1, n_jobs=-1, verbose=1)
# knn_random.fit(X_train_scaled, y_train_scaled)
# y_pred_knn = knn_random.predict_proba(X_test_scaled)[:, 1]
# print(knn_random.best_score_)
# print(knn_random.best_params_)
# Randomized search over SVC hyperparameters.
kernel = ['poly', 'rbf', 'sigmoid']
C = [50, 10, 1.0, 0.1, 0.01]
gamma = ['scale']
svc_grid = dict(kernel=kernel,C=C,gamma=gamma)
print(svc_grid)
svc_random = RandomizedSearchCV(svc_clf, param_distributions=svc_grid, scoring='roc_auc', cv=5, n_iter=20,
                                random_state=1, n_jobs=-1, verbose=1)
svc_random.fit(X_train_scaled, y_train_scaled)
print(svc_random.best_score_)
print(svc_random.best_params_)
# Random-forest search space.
#number of trees in forest
n_estimators = [int(x) for x in np.linspace(100,1200,12)]
#max depth of tree
max_depth = [int(x) for x in np.linspace(5,30,6)]
#quality of split
criterion = ['gini','entropy']
#min no. of samples to consider for splitting a internal node
min_samples_split = [2,5,7,10]
#min number of node can be as leaf node
min_samples_leaf = [2,5,8]
# The number of features to consider when looking for the best split.
# 'auto' was deprecated and removed in scikit-learn 1.3, and for
# classifiers it was an alias of 'sqrt' anyway, so the old
# ['auto', 'sqrt'] grid sampled the same setting twice; 'log2' keeps
# two genuinely different options.
max_features = ["sqrt", "log2"]
random_grid = dict(n_estimators=n_estimators, max_depth=max_depth, criterion=criterion,
                   min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf,
                   max_features=max_features)
print(random_grid)
rf_random = RandomizedSearchCV(rf_clf, param_distributions=random_grid, scoring='roc_auc', cv=5, n_iter=20,
                               random_state=1, n_jobs=-1, verbose=1)
rf_random.fit(X_train_scaled, y_train_scaled)
y_pred_rf = rf_random.predict_proba(X_test_scaled)[:, 1]
print(rf_random.best_score_)
print(rf_random.best_params_)
# CatBoost search space (the search itself is left commented out below).
n_estimators = [int(x) for x in np.linspace(100, 1000, 10)]
# NOTE(review): CatBoost only supports depth <= 16 — the 24/30 values
# would raise if the search were re-enabled. Confirm before re-enabling.
max_depth = [int(x) for x in np.linspace(6, 30, 5)]
learning_rate = [x for x in np.arange(0.001, 0.4, 0.001)]
l2_leaf_reg = [x for x in np.arange(0, 4, 0.1)]
# NOTE(review): np.random.randint is unseeded, so these candidate
# min_data_in_leaf values change from run to run.
min_data_in_leaf = np.random.randint(20, 200, 5)
cat_grid = dict(n_estimators=n_estimators, max_depth=max_depth, learning_rate=learning_rate,
                l2_leaf_reg=l2_leaf_reg, min_data_in_leaf=min_data_in_leaf)
# cat_random = RandomizedSearchCV(cat_clf, param_distributions=cat_grid, cv=5, random_state=1,
#                                 n_iter=20, scoring="roc_auc", n_jobs=-1, verbose=1)
# cat_random.fit(X_train_scaled,y_train_scaled)
# y_pred_cat = cat_random.predict_proba(X_test_scaled)[:, 1]
# print("score: ", cat_random.best_score_)
# print("best_params: \n", cat_random.best_params_)
# XGBoost search space and randomized search.
n_estimators = [int(x) for x in np.linspace(100, 1000, 10)]
max_depth = [int(x) for x in np.linspace(6, 30, 5)]
learning_rate = [x for x in np.arange(0.001, 0.4, 0.001)]
# how many observations we need to have in a leaf node
min_child_weight = list(range(1, 10))
xg_grid = dict(n_estimators=n_estimators, max_depth=max_depth, learning_rate=learning_rate,
               min_child_weight=min_child_weight)
print(xg_grid)
xgb_random = RandomizedSearchCV(xgb_clf, param_distributions=xg_grid, cv=5, random_state=1,
                                n_iter=20, scoring="roc_auc", n_jobs=-1, verbose=1)
xgb_random.fit(X_train_scaled, y_train_scaled)
y_pred_xgb = xgb_random.predict_proba(X_test_scaled)[:, 1]
print("score: ",xgb_random.best_score_)
print("best_params: \n",xgb_random.best_params_)
# Grid of candidate decision thresholds (0.00, 0.01, ..., 1.00).
thresholds = np.linspace(0, 1, 101)
def confusion_matrix_dataframe(y_val, y_pred, thresholds=None):
    """Sweep decision thresholds and tabulate confusion-matrix metrics.

    Parameters
    ----------
    y_val : array-like of 0/1 ground-truth labels.
    y_pred : array-like of predicted scores/probabilities.
    thresholds : optional 1-D array of cut-offs; defaults to the same
        0.00..1.00 grid as the module-level ``thresholds`` (so existing
        two-argument calls behave exactly as before).

    Returns
    -------
    pd.DataFrame with one row per threshold and columns threshold, tp,
    fp, fn, tn, tpr, fpr, precision, recall, f1, gmean, auc. Metrics
    whose denominator is zero come out as NaN.
    """
    if thresholds is None:
        thresholds = np.linspace(0, 1, 101)
    y_val = np.asarray(y_val)
    y_pred = np.asarray(y_pred)

    def _safe_div(num, den):
        # Avoid numpy's 0/0 RuntimeWarnings at the extreme thresholds.
        return num / den if den else np.nan

    rows = []
    for t in thresholds:
        actual_positive = (y_val == 1)
        actual_negative = (y_val == 0)
        predict_positive = (y_pred >= t)
        predict_negative = (y_pred < t)
        tp = (predict_positive & actual_positive).sum()
        tn = (predict_negative & actual_negative).sum()
        fp = (predict_positive & actual_negative).sum()
        fn = (predict_negative & actual_positive).sum()
        tpr = _safe_div(tp, tp + fn)
        fpr = _safe_div(fp, fp + tn)
        precision = _safe_div(tp, tp + fp)
        recall = tpr  # recall is the true-positive rate by definition
        f1 = _safe_div(2 * precision * recall, precision + recall)
        gmean = np.sqrt(tpr * (1 - fpr))
        # Balanced-accuracy style AUC proxy at this single threshold.
        auc = (1 + tpr - fpr) / 2
        rows.append((t, tp, fp, fn, tn, tpr, fpr, precision, recall, f1, gmean, auc))
    columns = ['threshold', 'tp', 'fp', 'fn', 'tn', 'tpr', 'fpr',
               'precision', 'recall', 'f1', 'gmean', 'auc']
    return pd.DataFrame(rows, columns=columns)
# Threshold sweep for the tuned logistic-regression predictions.
df_scores = confusion_matrix_dataframe(y_test_scaled, y_pred_lr)
df_scores.head()
import plotly.graph_objects as go
# locate the index of the largest g-mean
ix = np.argmax(df_scores.gmean)
print('Best Threshold=%f, G-Mean=%.3f' % (df_scores.threshold[ix], df_scores.gmean[ix]))
# Rows where precision equals recall (the break-even threshold).
df_scores[df_scores.precision==df_scores.recall]
# Precision and recall as functions of the decision threshold.
fig = go.Figure()
fig.add_trace(go.Scatter(x=df_scores.threshold, y=df_scores.precision,
                         mode='lines',
                         name='precision'))
fig.add_trace(go.Scatter(x=df_scores.threshold, y=df_scores.recall,
                         mode='lines',
                         name='recall'))
# Dotted marker — presumably the precision/recall crossing point; confirm.
fig.add_vline(0.26, line_dash="dot")
fig.show()
# F1 score across thresholds, with a dotted marker near its peak.
fig = go.Figure()
fig.add_trace(go.Scatter(x=df_scores.threshold, y=df_scores.f1,
                         mode='lines',
                         name='lines'))
fig.add_vline(0.25, line_dash="dot")
fig.show()
# ROC curve assembled from the per-threshold tpr/fpr columns.
fig = go.Figure()
fig.add_trace(go.Scatter(x=df_scores.fpr, y=df_scores.tpr,
                         mode='lines'))
fig.update_layout(
    title="ROC Curve",
    xaxis_title="False Positive Rate",
    yaxis_title="True Positive Rate",
)
fig.add_vline(0.15, line_dash="dot")
fig.show()
!jupyter nbconvert --to html "EDA, Modelling + Tuning.ipynb"